Digging into differences in the fall Bigelow survey data across different pulls of survdat
Objective is to isolate which main columns are inconsistent across the datasets and whether the problem is isolated to specific species.
For convenience we also limited the data to include years 2006-present.
For the following report we looked at 7 candidate survdat datasets.
To remove clutter from the following plots numbers: (1 & 4) will not be displayed as exploratory plots showed them to be “the same” as numbers: (2 & 3) respectively.
#### Group 1: 2016 & 2019 data ####
# # 2016
# load(paste0(mills_path, "Projects/WARMEM/Old survey data/Survdat_Nye2016.RData"))
# survdat_16 <- clean_names(survdat) %>% filter(est_year >= 2006)
# 2019
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_allseason.RData"))
survdat_19 <- clean_names(survdat) %>% filter(est_year >= 2006)
#### group 2: 2020 Data ####
# 2020
load(paste0(res_path, "NMFS_trawl/Survdat_Nye_Aug 2020.RData"))
survdat_20 <- clean_names(survdat) %>% filter(est_year >= 2006)
# # Double check of 2020 from Janet on 01282021
# load(paste0(res_path, "NMFS_trawl/Survdat_Nye_Aug 2020_check01282021.RData"))
# survdat_20_check <- clean_names(survdat) %>% filter(est_year >= 2006)
#### Group 3: 2021 Survdat Pulls
# 2021
load(paste0(res_path, "NMFS_trawl/survdat_slucey_01152021.RData"))
survdat_21 <- clean_names(survdat) %>% filter(year >= 2006)
# February 2021 following DBI package fix
load(paste0(res_path, "NMFS_trawl/Survdat_02182021.RData"))
feb_2021 <- survdat %>% clean_names()%>%
filter(year >= 2006)
# February 2021 following DBI package fix
load(paste0(res_path, "NMFS_trawl/NEFSC_BTS_02242021.RData"))
feb_23_2021 <- survey$survdat %>% clean_names()%>%
filter(year >= 2006)
# remove survdat
rm(survdat, survey)
The cleanup function does a number of steps to each survdat source. This includes the removal of specific strata, the removal of non-spring/fall seasons, removal of vessels other than the Albatross or Bigelow, the dropping of NA biomass and abundance events, and the removal of certain species like shrimp.
It also matches up the stations with larger regions they fall within and the survey effort within each of those. This should not have an impact on any of the things we are tracking below.
The code for the cleanup function can be found here
# Cleanup functions
source(here("R/01_nefsc_ss_build.R"))
# Clean each up using the survdat prep function
# trawldat_16 <- survdat_prep(survdat = survdat_16) %>%
# mutate(survdat_source = "2016")
trawldat_19 <- survdat_prep(survdat = survdat_19) %>%
mutate(survdat_source = "2016 - 2019")
trawldat_20 <- survdat_prep(survdat = survdat_20) %>%
mutate(survdat_source = "2020")
# trawldat_20_check <- survdat_prep(survdat = survdat_20_check) %>%
# mutate(survdat_source = "2020_check")
trawldat_21 <- survdat_prep(survdat = survdat_21) %>%
mutate(survdat_source = "2021")
trawldat_feb <- survdat_prep(survdat = feb_2021) %>%
mutate(survdat_source = "Feb 2021")
trawldat_feb23 <- survdat_prep(survdat = feb_23_2021) %>%
mutate(survdat_source = "Feb 23 2021")
rm(#survdat_16,
survdat_19,
survdat_20, #survdat_20_check,
survdat_21,
feb_2021,
feb_23_2021)
The ratio of survdat$abundance and sum(survdat$numlen) represents the scale of mismatch between the abundance of a species caught at in a tow when measured using the survdat$abundance column compared to the sum of survdat$numlen across all the lengths caught.
In theory they should be equal. Numbers greater than 1 indicate that the abundance column was greater than the sum of individuals at the different lengths.
Here is a helper function that will be applied to each survdat source to pull out the ratio between numlen and abundance totals.
# This is the same code used to make numlen_adj in the build:
get_convers <- function(trawldat){
abundance_check <- trawldat %>%
group_by(id, comname, catchsex, abundance) %>%
summarise(
numlen_abund = sum(numlen),
n_len_class = n_distinct(length), # number of distinct lengths caught that station
.groups = "keep") %>%
ungroup()
# Numlen conversion factor translates each size at length to what it would need be to be
# for them to add up to that "abundance" column
conv_factor <- trawldat %>%
distinct(id, comname, catchsex, length, abundance) %>%
inner_join(abundance_check, by = c("id", "comname", "catchsex", "abundance")) %>%
mutate(convers = abundance / numlen_abund)
# Merge back and convert the numlen field
trawldat_adjusted <- trawldat %>%
left_join(conv_factor, by = c("id", "comname", "catchsex", "length", "abundance", "n_len_class")) %>%
mutate(numlen_adj = numlen * convers, .after = numlen)
return(trawldat_adjusted)
}
And some organization as a list to process them:
# Put all the groups together to compare abundance ratios
trawldat_list <- list(
# "2016" = get_convers(trawldat_16),
"2019" = get_convers(trawldat_19),
"2020" = get_convers(trawldat_20),
# "2020 re-check" = get_convers(trawldat_20_check),
"2021" = get_convers(trawldat_21),
"Feb 2021" = get_convers(trawldat_feb),
"Feb 23 2021" = get_convers(trawldat_feb23)) %>%
map(~ .x %>%
mutate(season = str_to_title(season),
season = factor(season, levels = c("Spring", "Fall"))))
# Big table of all of them
trawldat_combined <- bind_rows(
trawldat_list,
.id = "survdat_source")
trawldat_combined %>%
mutate(season = fct_rev(season)) %>%
ggplot(aes(x = convers,
y = survdat_source,
color = survdat_source)) +
geom_boxplot() +
facet_grid(season ~ svvessel) +
scale_color_gmri() +
labs(subtitle = "numlen to abundance ratio - All Species",
x = "(survdat$abundance / sum(numlen))",
y = "",
caption = "Ratios calculated at each station for each species & sex.\nMay reflect change in catch composition.")
trawldat_combined %>%
filter(comname == "haddock") %>%
mutate(season = fct_rev(season)) %>%
ggplot(aes(x = convers,
y = survdat_source,
color = survdat_source)) +
geom_boxplot() +
facet_grid(season ~ svvessel) +
scale_color_gmri() +
labs(subtitle = "numlen to abundance ratio - Haddock only",
x = "(survdat$abundance / sum(numlen))",
y = "",
caption = "Ratio value calculated at each station/sex for haddock.")
The abund/numlen ratio is helpful for identifying consistency in behavior across the different pulls, but doesn’t really explain the how/why of the differences among them.
To get at the differences that contribute to them we use four commonly caught species to compare different components of the data sets.
For each of the following species we are comparing 4 main things across the different survdat sources:
This should tell us whether some datasets are missing stations completely, and whether the differences exist in the numlen, abundance, or biomass columns, and for what time periods.
For each of the following plots the drawing order of the lines is top to bottom as they appear in the key. If lines do not appear they likely are prefectly masked by some of the more recent pulls, which is good.
Differences in datasets are more apparent in the biomass and abundance totals during the Bigelow survey years.
NOTE: The Below code chunk details the functions used to isolate a given species and pull its information for display. Subsequent tabs will detail the differences among the different survdat pulls.
# Function to pull single species data
species_select <- function(species_choice){
# Use list of sources to pull select species from each
source_list <- trawldat_list
# Use map to not repeat so much code
select_species <- map_dfr(source_list, function(surv_source){
surv_source %>%
select(survdat_source, id, est_year, season, comname, length,
numlen, abundance, biom_adj) %>%
filter(comname == species_choice)})
return(select_species)
}
#### Plotting Numlen Totals and Station counts ####
plot_stations <- function(combined_data, species_choice){
# Total numlen fish summary
species_numlens <- combined_data %>%
group_by(survdat_source, est_year, season) %>%
summarise(n_stations = n_distinct(id),
sum_numlen = sum(numlen, na.rm = T),
.groups = "keep") %>% ungroup() %>%
mutate(season = fct_rev(season))
# Plot for station presence
station_plot <- ggplot(species_numlens,
aes(est_year, n_stations, color = survdat_source)) +
geom_line() +
geom_jitter(aes(group = survdat_source, color = survdat_source, shape = survdat_source),
size = 1.5, width = 0.25) +
facet_wrap(~season) +
scale_color_gmri() +
labs(x = "",
y = "Total Stations of Species Presence",
color = "", shape = "",
subtitle = paste0(species_choice, " only")) +
theme(legend.position = "none")
# Plot for numlen
numlen_plot <- ggplot(species_numlens,
aes(est_year, sum_numlen, color = survdat_source)) +
geom_line() +
geom_jitter(aes(group = survdat_source, color = survdat_source,
shape = survdat_source),
size = 1.5, width = 0.25) +
facet_wrap(~season) +
scale_color_gmri() +
labs(x = "", y = "Total Abundance using survdat$numlen",
color = "", shape = "") +
theme(legend.position = "bottom")
plot_out <- station_plot / numlen_plot
return(plot_out)
}
#### Plotting Biomass and Abundances ####
plot_biomass <- function(combined_data, species_choice){
# Abundance and Biomass Summary
species_abund_bio <- combined_data %>%
distinct(survdat_source, est_year, season, id, abundance, biom_adj) %>%
group_by(survdat_source, est_year, season) %>%
summarise(n_stations = n_distinct(id),
abundance = sum(abundance, na.rm = T),
biomass = sum(biom_adj, na.rm = T),
.groups = "keep") %>%
ungroup() %>%
mutate(season = fct_rev(season))
# Plot for abundance
abund_plot <- ggplot(species_abund_bio,
aes(est_year, abundance, color = survdat_source)) +
geom_line() +
geom_jitter(aes(group = survdat_source, color = survdat_source, shape = survdat_source),
size = 1.5, width = 0.15) +
facet_wrap(~season) +
scale_color_gmri() +
labs(x = "", y = "Total survdat$abundance",
subtitle = paste0(species_choice, " only")) +
theme(legend.position = "none")
# plot for biomass
bio_plot <- ggplot(species_abund_bio, aes(est_year, biomass, color = survdat_source)) +
geom_line() +
geom_jitter(aes(group = survdat_source, color = survdat_source,
shape = survdat_source), size = 1.5,
width = 0.15) +
facet_wrap(~season) +
scale_color_gmri() +
labs(x = "", y = "Total survdat$biomass",
color = "", shape = "") +
theme(legend.position = "bottom")
plot_out <- abund_plot / bio_plot
return(plot_out)
}
NOTE: The following plots have had their points jittered to expose situations of overlap.
species_choice <- "haddock"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)
plot_biomass(single_species, species_choice)
species_choice <- "spiny dogfish"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)
plot_biomass(single_species, species_choice)
species_choice <- "atlantic cod"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)
plot_biomass(single_species, species_choice)
species_choice <- "atlantic herring"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)
plot_biomass(single_species, species_choice)
species_choice <- "american lobster"
single_species <- species_select(species_choice)
plot_stations(single_species, species_choice)
plot_biomass(single_species, species_choice)
This section will detail how each data set compares along a handful of common metrics. How many stations there are, how much biomass there is, which strata are present etc.
These should all be about the same, but may point us in the direction of what might be causing these differences.
trawldat_combined %>%
rename(`Survdat Source` = survdat_source) %>%
group_by(`Survdat Source`) %>%
summarise(
`n Years` = n_distinct(est_year),
`n Stations` = n_distinct(id),
`n Strata` = n_distinct(stratum),
`n Species` = n_distinct(comname)) %>%
kable() %>%
kable_styling()
| Survdat Source | n Years | n Stations | n Strata | n Species |
|---|---|---|---|---|
| 2019 | 13 | 5885 | 53 | 311 |
| 2020 | 14 | 6608 | 53 | 320 |
| 2021 | 14 | 6576 | 53 | 312 |
| Feb 2021 | 14 | 6576 | 53 | 312 |
| Feb 23 2021 | 14 | 6576 | 53 | 312 |
Based on the visualizations above there appears to be some inconsistency in the different datasets centered around data collected after 2008 on the RV Henry Bigelow.
These datasets seem to fall into 3 distinct groups when looking at abundance:
Pre-2020
2020 Survdat Nye Data + 2021 Data w/o DBI error
2021 Survdat Pull w/ DBI error
And two different groups when looking at biomass:
2021 Survdat Pull w/ DBI error
Everything else
# Pull a single species
species_choice <- "atlantic cod"
single_species <- species_select(species_choice)
# Label the groups that we identified as behaving similarly:
cod_sources <- single_species %>%
mutate(
`survdat group` = case_when(
survdat_source == "2016 - 2019" ~ "Pre-2020 Data",
str_detect(survdat_source, "2020") ~ "2020 Data",
survdat_source == "2021" ~ "2021 w/ DBI Error",
survdat_source == "Feb 2021" ~ "2021 w/o DBI Error",
survdat_source == "Feb 23 2021" ~ "Feb 2021 newest"),
`survdat group` = factor(`survdat group`,
levels = c("Pre-2020 Data",
"2020 Data",
"2021 w/ DBI Error",
"2021 w/o DBI Error",
"Feb 2021 newest"))) %>%
distinct(`survdat group`, survdat_source,
est_year, season, id, abundance, biom_adj) %>%
group_by(`survdat group`, survdat_source, est_year, season) %>%
summarise(n_stations = n_distinct(id),
abundance = sum(abundance, na.rm = T),
biomass = sum(biom_adj, na.rm = T),
.groups = "keep") %>%
ungroup() %>%
mutate(season = fct_rev(season))
#ellipse marker
mark1 <- cod_sources %>%
filter(season == "Fall", survdat_source == "Feb 23 2021", est_year == 2009)
# Abundance
cod_sources %>%
filter(season == "Fall") %>%
ggplot(aes(est_year, abundance)) +
geom_line(aes(group = survdat_source, color = `survdat group`), alpha = 0.3) +
geom_point(aes(group = survdat_source, color = `survdat group`,
shape = survdat_source), size = 1.5) +
geom_mark_ellipse(data = mark1,
aes(est_year, abundance, label = "Separation of the groups"),
label.fill = "transparent",
linetype = 2) +
scale_color_gmri() +
facet_wrap(~season) +
facet_zoom(xlim = c(2008:2011)) +
labs(x = "",
y = "Total survdat$abundance",
title = "Data Mismatches Following Vessel Switch",
subtitle = "Inconsistency in Bigelow data over time.",
caption = paste0(species_choice, " only"),
shape = "Pull Source",
color = "Consistent Grouping") +
guides("shape" = guide_legend(title.position = "top", title.hjust = 0.5, ncol = 2),
"color"= guide_legend(title.position = "top", title.hjust = 0.5, ncol = 2)) +
theme(legend.position = "bottom")
# Biomass
cod_sources %>%
filter(season == "Fall") %>%
ggplot(aes(est_year, biomass)) +
geom_line(aes(group = survdat_source, color = `survdat group`), alpha = 0.3) +
geom_point(aes(group = survdat_source, color = `survdat group`,
shape = survdat_source), size = 1.5) +
geom_mark_ellipse(data = mark1,
aes(est_year, biomass, label = "Separation of the groups"),
label.fill = "transparent",
linetype = 2) +
scale_color_gmri() +
facet_wrap(~season) +
facet_zoom(xlim = c(2007:2013)) +
labs(x = "",
y = "Total survdat$biomass",
title = "Data Mismatches Following Vessel Switch",
subtitle = "Inconsistency in Bigelow data over time.",
caption = paste0(species_choice, " only"),
shape = "Pull Source",
color = "Consistent Grouping") +
guides("shape" = guide_legend(title.position = "top", title.hjust = 0.5, ncol = 2),
"color"= guide_legend(title.position = "top", title.hjust = 0.5, ncol = 2)) +
theme(legend.position = "bottom")
For the following figures the annual totals for biomass and abundance were compared pairwise between data source using a single species to isolate if the behavior exists across them or is isolated to specific ones.
IMPORTANT Pairwise comparisons now compare to the February 23rd 2021 dataset.
#### Data Prep and Plotting Functions
# Pull a species, pivot and plot
pull_pivot_plot <- function(target_comname = "haddock", comparison = c("Jan 2021", "2016-2019")){
# Pull annual abundance and biomass totals for haddock
annual_totals <- trawldat_combined %>%
filter(survdat_source %in% c("2019", "2021", "Feb 2021", "Feb 23 2021"),
comname == target_comname) %>%
distinct(survdat_source, est_year, season, id, abundance, biom_adj) %>%
group_by(survdat_source, est_year, season) %>%
summarise(abundance = sum(abundance, na.rm = T),
biomass = sum(biom_adj, na.rm = T),
.groups = "keep") %>%
ungroup()
# Get Ratios for between sources for abundance / biomass
yr_ratios <- annual_totals %>%
pivot_wider(names_from = survdat_source,
values_from = c(abundance, biomass)) %>%
mutate(
b_ratio_febto2021 = `biomass_Feb 23 2021` / biomass_2021,
b_ratio_febto2019 = `biomass_Feb 23 2021` / biomass_2019,
a_ratio_febto2021 = `abundance_Feb 23 2021` / abundance_2021,
a_ratio_febto2019 = `abundance_Feb 23 2021` / abundance_2019)
#### Plot the biomass
ylab_b <- paste0("BIOM Ratio\nFeb 23, 2021 : ", comparison)
ylab_a <- paste0("ABUND Ratio\nFeb 23, 2021 : ", comparison)
# y column for biomass
ycol_b = switch(EXPR = comparison,
"Jan 2021" = sym("b_ratio_febto2021"),
"2016-2019" = sym("b_ratio_febto2019"))
# y column for abundance
ycol_a = switch(EXPR = comparison,
"Jan 2021" = sym("a_ratio_febto2021"),
"2016-2019" = sym("a_ratio_febto2019"))
caption <- switch(
comparison,
"Jan 2021" = "With the idea that bigelow conversions were not applied in the previous pull from Jan 15th.\nWe would assume a consistent ratio between the two sources, equaling the biomass/abundance conversion rate.",
"2016-2019" = "The idea with these is that bigelow conversions were not correctly applied, which was corrected after 2019.\nWe would still expect a consistent ratio between the two sources across years, equaling the difference in the conversions used.")
# Plot the biomass
bio_ratio <- ggplot(yr_ratios, aes(est_year)) +
geom_line(aes(est_year, !!ycol_b)) +
facet_wrap(~season) +
labs(x = "",
y = ylab_b,
title = str_to_title(target_comname),
subtitle = "Biomass Ratios")
# Plot the abundance
abund_ratio <- ggplot(yr_ratios, aes(est_year)) +
geom_line(aes(est_year, !!ycol_a)) +
facet_wrap(~season) +
labs(x = "",
y = ylab_a,
subtitle = "Abundance Ratios",
caption = caption)
plot_out <- bio_ratio / abund_ratio
return(plot_out)
}
pull_pivot_plot(target_comname = "haddock", comparison = "Jan 2021")
pull_pivot_plot(target_comname = "haddock", comparison = "2016-2019")
pull_pivot_plot(target_comname = "atlantic cod", comparison = "Jan 2021")
pull_pivot_plot(target_comname = "atlantic cod", comparison = "2016-2019")
pull_pivot_plot(target_comname = "american lobster", comparison = "Jan 2021")
pull_pivot_plot(target_comname = "american lobster", comparison = "2016-2019")
A work by Adam A. Kemberling
Akemberling@gmri.org